#!/usr/bin/env python2.3 # # Validation Test.py # # This program takes in an expression data file # and a BPM network and outputs the result of # the validation test described in "Evaluating # Between-Pathway Models with Expression Data", # published in RECOMB 2009 # # Created by Max Leiserson on 7/30/08. # import time import sys import string import random import os def readBPMs (response): FILE = open (response, 'r') line = 'not null' partition1 = [] partition2 = [] i = 0 while line != '': line = FILE.readline() line = line.rstrip() array = line.split ("\t") if ((i %2) == 0): partition1.append (array) else: partition2.append (array) i = i + 1 total = [] total.append (partition1) total.append (partition2) response = response.strip('BPM Networks/') response = response.rstrip('.tx') if response.count('KI ') > 0: currentTime = 'Output/KI Pruned' + '/' + time.strftime("%d%b%Y", time.localtime()) +' ' + time.strftime("%H_%M_%S", time.localtime()) os.mkdir (currentTime) os.chdir (currentTime) else: currentTime = 'Output/' + response + '/' + time.strftime("%d%b%Y", time.localtime()) +' ' + time.strftime("%H_%M_%S", time.localtime()) os.mkdir (currentTime) os.chdir (currentTime) print 'Dataset:', response return total # retrieve all the KO data from the hughes dataset def getKnockouts(expression_data): #open file geneList = [] knockoutInfo =[] prunedList = [] #all the genes in the hughes file #fix later - read until file ends for i in range(0, 4): geneList.append(expression_data.readline()) #print geneList[2] tempGenes = geneList[2].split('\t') del tempGenes[0:4] hughGeneList=[] indices = [] #put gene column titles *top row* stored at 0 for i in range(0, len(tempGenes)): if(i%4 == 0): genes = [] if (tempGenes[i].count(' (') > 0): arr = tempGenes[i].split(' (') gene = arr[0].upper() else: gene = tempGenes[i].upper() if (gene.count(',')): arr2 = gene.split(',') if (arr2[0].count('"') > 0): arr3 = arr2[0].split('"') genes.append(arr3[1].upper()) else: genes.append(arr2[0]) if (arr2[1].count('"') > 0): arr4 = arr2[1].split('"') genes.append(arr4[0].upper()) else: genes.append(arr2[1].upper()) else: genes.append(gene) hughGeneList.append(genes) prunedList.append(genes[0]) indices.append(i/4) if (len(genes) > 1): prunedList.append(genes[1]) indices.append(i) # put a list of all the genes tested in hughes in index 0 knockoutInfo.append(prunedList) next = geneList[3].split('\t') del next[0:2] stats = next[0:4] #put stat titles in knockout info at index 1 knockoutInfo.append(stats) #put knockout gene *first column* in knockout info in index 2 knockouts = [] line = expression_data.readline() while line != '': data = line.split('\t') knockouts.append(data[0].upper()) line = expression_data.readline() knockoutInfo.append(knockouts) knockoutInfo.append(hughGeneList) knockoutInfo.append(indices) return knockoutInfo def readRegistryGenenames(): #opens the file of genenames and common names, and creates a dictionary where one can find the common name by entering the genename openFile = open ('../../../registry.genenames.tab', 'r') lines = [] genenames = {} for line in openFile: lines.append (line) for item in lines: anArray = item.rsplit ('\t') genenames[anArray[-2]] = anArray[0] # in the file, the first item is the common name and the second to last item is the genename return genenames def ourGeneList(bpmList): ourList = [] ourListY = [] ourListCommon = [] for i in range(0, len(bpmList[0])): for j in range(0, len(bpmList[0][i])): if(ourListY.count(bpmList[0][i][j]) == 0): ourListY.append(bpmList[0][i][j]) for i in range(0, len(bpmList[1])): for j in range(0, len(bpmList[1][i])): if(ourListY.count(bpmList[1][i][j]) == 0): ourListY.append(bpmList[1][i][j]) for i in range(0, len(bpmList[2])): for j in range(0, len(bpmList[2][i])): if(ourListCommon.count(bpmList[2][i][j]) == 0): ourListCommon.append(bpmList[2][i][j]) for i in range(0, len(bpmList[3])): for j in range(0, len(bpmList[3][i])): if(ourListCommon.count(bpmList[3][i][j]) == 0): ourListCommon.append(bpmList[3][i][j]) ourList.append(ourListY) ourList.append(ourListCommon) return ourList def theirListLocations(ourList, results): theirList = [] theirListY = [] theirListCommon = [] for i in range(0, len(ourList[0])): if(results.count(ourList[0][i]) != 0): location = results.index(ourList[0][i]) theirListY.append(location) else: theirListY.append(-1) for i in range(0, len(ourList[1])): if(results.count(ourList[1][i]) != 0): location = results.index(ourList[1][i]) theirListCommon.append(location) else: theirListCommon.append(-1) theirList.append(theirListY) theirList.append(theirListCommon) return theirList def getHughesColumn(expression_data, offset): expression_data.seek(0,0) returnData = [] for k in range (0, 4): junk = expression_data.readline() line = expression_data.readline() while line != '': temp = [] data = line.split('\t') temp.append(data[0]) temp.append(data[offset*4+2]) temp.append(data[offset*4+3]) temp.append(data[offset*4+4]) temp.append(data[offset*4+5]) returnData.append(temp) line = expression_data.readline() return returnData def getSortedMicroArray(inputMicroArray): micro = [] for j in range(0, len(inputMicroArray)): temp = [] if inputMicroArray[j][2] != ' NaN': temp.append(float(inputMicroArray[j][2])) #convert the third column to a float, take its absolute value, and append it to temp temp.append(inputMicroArray[j][0]) micro.append(temp) micro.sort() #sort the array micro.reverse() #reverse it, so it is in the correct order return micro #comments # getSortedArray does the same thing as getSortedMicroArray, except it doesn't include the names in the array def getSortedArray (inputMicroArray): micro = [] for j in range(0, len(inputMicroArray)): if inputMicroArray[j][2] != ' NaN': micro.append(float(inputMicroArray[j][2])) micro.sort() micro.reverse() return micro def getSortedMicroArrayAbs(inputMicroArray): micro = [] for j in range(0, len(inputMicroArray)): temp = [] if inputMicroArray[j][2] != ' NaN': temp.append(abs(float(inputMicroArray[j][2]))) #convert the third column to a float, take its absolute value, and append it to temp temp.append(inputMicroArray[j][0]) micro.append(temp) micro.sort() #sort the array micro.reverse() #reverse it, so it is in the correct order return micro #comments # getSortedArray does the same thing as getSortedMicroArray, except it doesn't include the names in the array def getSortedArrayAbs (inputMicroArray): micro = [] for j in range(0, len(inputMicroArray)): if inputMicroArray[j][2] != ' NaN': micro.append(abs(float(inputMicroArray[j][2]))) micro.sort() micro.reverse() return micro #comments #inputArray is a sorted micro array #geneset is the overall gene set to test #index is the index up to which N_r will be calculated def calculateN_r(inputArray, geneList): N_r = 0 for i in range(0,len(inputArray)): for j in range(0, len(geneList)): #print '---' #print 'inputArray ', inputArray[i][1] #print '--' #print 'geneList ', geneList[j] if inputArray[i][1] == geneList[j]: N_r += abs(inputArray[i][0]) if N_r == 0: print 'ERROR with N_r count' return -1 else: return float(N_r) #comments # add up the absolute values of the log10 values for an inputArray def calculateN_r2 (inputArray): total = 0.0 for i in range (len (inputArray)): total += abs(inputArray[i][0]) return total def generanks (sorted, genelist): array = [] n = float(len(sorted)) sortedNames = [] sorted.reverse() for i in range (len (sorted)): sortedNames.append (sorted[i][1]) for item in genelist: temp = [] temp.append (((((n/2) - float(sortedNames.index (item))) / (n/2)))) temp.append (float(sortedNames.index (item))) temp.append(item) array.append (temp) array.sort() array.reverse() return array def generanksAbs (sorted, genelist): array = [] sorted.reverse() sortedNames = [] newSorted = [] numberBefore = 0 numberDistinct = 0 currentDupes = 0 currentValue = sorted[0][0] for z in range(len(sorted)): temp = [] if(sorted[z][0] != currentValue): numberBefore +=currentDupes currentDupes = 0 numberDistinct +=1 temp.append(numberBefore) temp.append(sorted[z][0]) temp.append(sorted[z][1]) newSorted.append(temp) numberBefore +=1 else: currentDupes +=1 temp.append(numberBefore) temp.append(sorted[z][0]) temp.append(sorted[z][1]) newSorted.append(temp) currentValue = sorted[z][0] finalSort = [] rankWeight = len(sorted) currentDupes = 0 currentValue = sorted[0][0] for i in range(len(sorted)): temp = [] if(sorted[i][0] != currentValue): rankWeight -= currentDupes currentDupes = 0 temp.append(rankWeight) temp.append(newSorted[i][0]) temp.append(newSorted[i][1]) temp.append(newSorted[i][2]) finalSort.append(temp) rankWeight -=1 else: currentDupes += 1 temp.append(rankWeight) temp.append(newSorted[i][0]) temp.append(newSorted[i][1]) temp.append(newSorted[i][2]) finalSort.append(temp) currentValue = sorted[i][0] returnSet = [] for k in range (len(finalSort)): if(counter(genelist, finalSort[k][3]) != 0): returnSet.append(finalSort[k]) return returnSet def counter(list, item): count = 0 for i in range(0, len(list)): if (list[i] == item): count +=1 return count def gsea(inputMicroArray, geneList, gseaType): #sort inputMicroArray by log10 score sorted = getSortedMicroArray(inputMicroArray) #switch it to smallest to biggest #----optimize later sorted.reverse() geneRanks = generanksAbs(sorted, geneList) maxDiff = 0 answer = [] tempAnswer = 0 runningHit = 0 temp = 0 nr = calculateN_r2(geneRanks, gseaType) N = float(len(inputMicroArray)) Nh = float(len(geneRanks)) for i in range(0, len(geneRanks)): #test right before we see a new element - #misses have reached their max if (N == 0): N = 1 if (nr == 0): nr = 1 try: temp = (runningHit/nr) - ((geneRanks[i][1])/N) except ZeroDivisionError: temp = 0 if (abs(temp) > maxDiff): maxDiff = abs(temp) tempAnswer = temp #add the element we are on runningHit += abs(geneRanks[i][2]) temp = (runningHit/nr) - ((geneRanks[i][1])/N) if (abs(temp) > maxDiff): maxDiff = abs(temp) tempAnswer = temp answer.append(tempAnswer) return answer def gsea2(inputMicroArray, geneList): #sort inputMicroArray by log10 score sorted = getSortedMicroArray(inputMicroArray) #switch it to smallest to biggest #----optimize later sorted.reverse() geneRanks = generanks(sorted, geneList) maxDiff = 0 answer = [] tempAnswer = 0 runningHit = 0 nr = calculateN_r2(geneRanks, '2') N = float(len(inputMicroArray)) Nh = float(len(geneRanks)) for i in range(0, len(geneRanks)): #test right before we see a new element - #misses have reached their max temp = (runningHit/nr) - ((geneRanks[i][1] - i -1)/N) if (abs(temp) > maxDiff): maxDiff = abs(temp) tempAnswer = temp #add the element we are on runningHit += abs(geneRanks[i][0]) temp = (runningHit/nr) - ((geneRanks[i][1] - i)/N) if (abs(temp) > maxDiff): maxDiff = abs(temp) tempAnswer = temp answer.append(tempAnswer) return answer def compute_rank_cluster_score(inputMicroArray, geneList): #sort inputMicroArray by log10 score sorted = getSortedMicroArrayAbs(inputMicroArray) #switch it to smallest to biggest #----optimize later sorted.reverse() geneRanks = generanksAbs(sorted, geneList) maxDiff = 0 answer = [] tempAnswer = 0 runningHit = 0 temp = 0 nr = calculateN_r2(geneRanks) N = float(len(sorted)) # sorted does not include the NaN values, which should be ignored when calculating misses Nh = float(len(geneRanks)) for i in range(0, len(geneRanks)): #test right before we see a new element - #misses have reached their max temp = (runningHit/nr) - ((geneRanks[i][1])/N) if (abs(temp) > maxDiff): maxDiff = abs(temp) tempAnswer = temp #add the element we are on runningHit += abs(geneRanks[i][0]) temp = (runningHit/nr) - ((geneRanks[i][1])/N) if (abs(temp) > maxDiff): maxDiff = abs(temp) tempAnswer = temp answer.append(tempAnswer) return answer def get_random_geneset_data(inputMicroArray, geneList): random.seed() pRaw = [] #because we want no NaN reuse sort - but don't reuse it can't be processed twice #in v2.0 write a clean function sorted = getSortedMicroArrayAbs (inputMicroArray) for i in range(0, 99): # get a random set of genes randomSet = [] while len(randomSet) < len(geneList): candidate = random.randint(0, len(sorted)-1) count = 0 for j in range(0, len(geneList)): if (sorted[candidate][1] == geneList[j]): count +=1 if count == 0: #print 'adding ', sorted[candidate][1], ' to list' randomSet.append(sorted[candidate][1]) pRaw.append (compute_rank_cluster_score(inputMicroArray, randomSet)) return pRaw def get_random_knockout_data(expression_data, knockOutPathway, knockOutPathwayY, EsPathway, EsPathwayY, knockOutList): #get random set of test - none of which are in knockout pathway #calculate the opposite pathway's es value random_CRS_scores = [] for i in range(0, 99): found = 0 rIndex = 0 random.seed() while found == 0: count = 0 rIndex = random.randint(0,len(knockOutList)-1) for a in range(len(knockOutList[rIndex])): count += counter(knockOutPathway, knockOutList[rIndex][a]) count += counter(knockOutPathwayY, knockOutList[rIndex][a]) count += counter(EsPathway, knockOutList[rIndex][a]) count += counter(EsPathwayY, knockOutList[rIndex][a]) if count == 0: found = 1 break #print 'attempting index is', rIndex random_hughes_column = getHughesColumn(expression_data, rIndex) random_CRS_scores.append (compute_rank_cluster_score(random_hughes_column, EsPathwayY)) return random_CRS_scores def calculate_p_value(CRS, random_CRS_scores): # once all the random values have been appended to random_CRS_scores, # sort random_CRS_scores so as to make finding the CRS's rank in random_CRS_scores possible random_CRS_scores.sort() p = 0.0 # for each item in random_CRS_scores, if p is greater than the CRS, incrememnt p by 1 for a in range (0, len(random_CRS_scores)): if (CRS > random_CRS_scores[a]): p += 1.0 # add an extra 1 to p, to account for the fact that counting in computers starts at 0 p += 1.0 # divide get_random_geneset_data by 100, effectively making it a p value p = p/100.0 return 1 - p def collect_bpms_that_meet_criteria(suppsInfo, knockoutInfo): outputFileString = 'All BPMs.html' # create a file in the directory that was previously created by the program that stores the list of BPMs outputFile = open(outputFileString, 'w') # that meet the required specifications outputFile.write ('\n All BPMs.html \n

') highlight = 0 genenames = readRegistryGenenames() pathway1 = [] pathway2 = [] print 'Total BPMs: ', len(suppsInfo[1]) for i in range (len(suppsInfo[1])): # for every BPM read in either from the user or from file (suppsInfo[1] is the same length as suppsInfo[0]) temp1 = [] temp2 = [] testCounter1 = 0 testCounter2 = 0 for j in range (len(suppsInfo[0][i])): # for every member of a pathway in the first column of the current BPM try: testCounter1 += knockoutInfo[0].count (genenames[suppsInfo [0][i][j]]) # record the total number of tests peformed upon any gene in the pathway except KeyError: # i.e. count the number of genes in the pathway that were knocked out try: testCounter1 += knockoutInfo[0].count (suppsInfo [0][i][j]) except KeyError: counter +=1 misses.append (suppsInfo[0][i][j]) pass for k in range (len(suppsInfo[1][i])): # for every member of a pathway in the second column of the current BPM try: testCounter2 += knockoutInfo[0].count (genenames[suppsInfo[1][i][k]]) # record the total number of tests performed upon any gene in the pathway except KeyError: # i.e. count the number of genes in the pathway that were knocked out try: testCounter2 += knockoutInfo[0].count (suppsInfo[1][i][k]) except KeyError: counter += 1 misses.append (suppsInfo[1][i][j]) pass if (testCounter1 > 0 and len (suppsInfo[1][i]) > 2) or (testCounter2 > 0 and len (suppsInfo[0][i]) > 2): # if pathway 1 contains a gene that was tested and pathway 2 is longer than 2, or if pathway 2 contains a gene that was tested and pathway 1 is longer than 2, # add both pathway 1 and pathway 2 as a single BPM to the BPM list outputFile.write ('\nBPM ') outputFile.write (str(i)) outputFile.write ("(") outputFile.write (str(highlight)) outputFile.write (")!!
") if (testCounter1 > 0 and len (suppsInfo[1][i]) > 2) and (testCounter2 <= 0 or len (suppsInfo[0][i]) <= 2): outputFile.write ('') outputFile.write (str (suppsInfo[0][i])) outputFile.write ('
') outputFile.write (str (suppsInfo[1][i])) elif (testCounter2 > 0 and len(suppsInfo[0][i]) > 2) and (testCounter1 <= 0 or len (suppsInfo[1][i]) <= 2): outputFile.write (str(suppsInfo[0][i])) outputFile.write ('
') outputFile.write (str (suppsInfo[1][i])) outputFile.write ('') else: outputFile.write ('') outputFile.write (str (suppsInfo[0][i])) outputFile.write ('
') outputFile.write (str (suppsInfo[1][i])) outputFile.write ('
') highlight += 1 outputFile.write ('



') pathway1.append (suppsInfo[0][i]) # adding pathway 1 to the list of pathways 1 pathway2.append (suppsInfo[1][i]) # adding pathway 2 to the list of pathways 2 (since there is a one-to-one correspondence between pathway1s and pathway # 2s, these pathways will appear at the same index in the array of all BPMs) else: outputFile.write ("BPM ") outputFile.write (str(i)) outputFile.write ("!!
") outputFile.write (str (suppsInfo[0][i])) outputFile.write ('
') outputFile.write (str (suppsInfo[1][i])) outputFile.write ('


') outputFile.write ('

') total = [] total.append(pathway1) total.append(pathway2) return total #comments # compileBPMsY is called by run in order to create an array of all BPMs that meet the minimum requirements. In this case, each BPM must have at least 1 test opposite # a BPM of length at least 3. Once that requirement is met, both sides of the BPM are added to a BPM list, where column one represents pathway1 and column two # represents pathway 2. ALl the items in both columns of this list are the genenames (Y-names), not the common names. def compileBPMsY(rawResponse, knockouts): suppsInfo = readBPMs(rawResponse) total = [] counter = 0 misses = [] collection = collect_bpms_that_meet_criteria (suppsInfo, knockouts) total.append (collection[0]) #all the first pathways in the appropriate BPMs total.append (collection[1]) #all the second pathways in the appropriate BPMs print 'BPMs that meet criteria: ', len (collection[0]) if rawResponse == "6": outputValidBpms() return total # return the list of BPMs #comments # compileBPMsCommon is a method called by run after run called compileBPMsY. The method takes in a list called suppsInfo, # which is the same list that compileBPMsY returned. The entire purpose of compileBPMsCommon is to create an exact # copy of suppsInfo, except with the common names instead of the genenames (Y-names) def compileBPMsCommon (suppsInfo): total = [] genenames = [] genenames = readRegistryGenenames() pathway3 = [] pathway4 = [] for n in range(len(suppsInfo[0])): # for every BPM in suppsInfo... temp3 = [] temp4 = [] for o in range (len(suppsInfo[0][n])): # for every gene in pathway 1 of each BPM try: temp3.append (genenames[suppsInfo[0][n][o]]) # append the common name to the pathway 1 list of common names except KeyError: temp3.append ('NULL') for p in range (len(suppsInfo[1][n])): # for every gene in pathway 2 of each BPM try: temp4.append (genenames[suppsInfo[1][n][p]]) # append the common name to the pathway 2 list of common names except KeyError: temp4.append ('NULL') pathway3.append (temp3) pathway4.append (temp4) total.append (pathway3) total.append (pathway4) return total # return the total list of BPMs denoted by their common names #comments # compileYTestLocations takes in a list named suppsInfo, which is the exact list returned by compileBPMsY when called by # run. compileYTestLocations then iterates through every BPM, and finds the test locations for each gene in each BPM. # Since some of the names of the tests are denoted by common names and some are noted by genename, this step is necessary # for easy access to the tests later on. A -1 is put into the list for each gene that doesn't have a test, either because # the name of the test is denoted with a common name, or the gene simply doesn't have a test. def compileYTestLocations (suppsInfo, knockoutInfo): pathway5 = [] pathway6 = [] total = [] indices = knockoutInfo[4] for i in range (len(suppsInfo[0])): #for each BPM temp1 = [] temp2 = [] for l in range (len(suppsInfo[0][i])): # for each item in pathway 1 of the current BPM try: temp1.append (indices[knockoutInfo[0].index (suppsInfo[0][i][l])]) # finds the index of a test in the knockoutInfo except ValueError: # and appends it to the list, unless there is no temp1.append (-1) # test, in which case -1 is appended for m in range (len(suppsInfo[1][i])): # for each item in pathway 2 of the current BPM try: temp2.append (indices[knockoutInfo[0].index (suppsInfo[1][i][m])]) # finds the index of a test in the knockoutInfo except ValueError: # and appends it to the list, unless there is temp2.append ( -1) # no test, in which case -1 is appended pathway5.append (temp1) pathway6.append (temp2) total.append (pathway5) total.append (pathway6) return total # return the combined lists def compileCommonTestLocations (suppsInfo, knockoutInfo): total = [] genenames = [] genenames = readRegistryGenenames() pathway6 = [] pathway7 = [] indices = knockoutInfo[4] for n in range(len(suppsInfo[0])): # for each BPM temp6 = [] temp7 = [] for o in range (len(suppsInfo[0][n])): # for each item in pathway 1 of the current BPM try: tempString = genenames[suppsInfo[0][n][o]] # finds the common name of the current gene, because suppsInfo except KeyError: # came directly from compileBPMsY, and this method is looking tempString = 'Absolutely not there' # for tests denoted by common names try: temp6.append (indices[knockoutInfo[0].index (tempString)]) # finds the index of a test in the knockoutInfo except ValueError: # and appends it to the list, unless there is no test, in which temp6.append (-1) # case -1 is appended for p in range (len(suppsInfo[1][n])): try: tempString2 = genenames[suppsInfo[1][n][p]] # finds the common name of the current gene, because suppsInfo except KeyError: # came directly from compileBPMsY, and this method is looking tempString2 = 'Absolutely not there' # for tests denoted by common names try: temp7.append (indices[knockoutInfo[0].index (tempString2)]) # finds hte index of a test in the knockoutInfo and except ValueError: # appends it to the list, unless there is not test, in temp7.append (-1) # which case -1 is appended pathway6.append (temp6) pathway7.append (temp7) total.append (pathway6) total.append (pathway7) return total # return the combined lists # Ask the user which process with BPMs s/he would like to perform def determine_file_path (): print '----------------BPM Validation Test----------------\n' print 'Please enter the file path for the BPMs\n\n' response = raw_input("") response = 'BPM Networks/' + response return response # compile a 3D array of BPMs and their locations with their Y and common genenames def create_bpm_list(response, knock): temp1 = compileBPMsY (response, knock) # compile the list of all BPMs, and have them all be represented with their genenames temp2 = compileBPMsCommon (temp1) # compile the list of all BPMs represented by their common names from the list of # BPMs denoted with their genenames temp3 = compileYTestLocations (temp1, knock) # find the test locations for all BPMs denoted by their genenames temp4 = compileCommonTestLocations (temp1, knock) # find the test locations for all BPMs denoted by their common names bpmList = [] bpmList.append (temp1[0]) bpmList.append (temp1[1]) bpmList.append (temp2[0]) bpmList.append (temp2[1]) bpmList.append (temp3[0]) bpmList.append (temp3[1]) bpmList.append (temp4[0]) bpmList.append (temp4[1]) return bpmList # using the responses from the user, compile the output file strings for use in run() def get_output_file_strings(gseaType): outputFileStrings = [] pFileString = 'Random Geneset P Values.txt' pFileKOString = 'Random KO P Values.txt' outputFileString = 'Output.html' prunedPFileString = 'Pruned Random Geneset P Values.txt' prunedPFileKOString = 'Pruned Random KO P Values.txt' outputFileStrings.append(pFileString) outputFileStrings.append(outputFileString) outputFileStrings.append(prunedPFileString) outputFileStrings.append(pFileKOString) outputFileStrings.append(prunedPFileKOString) return outputFileStrings def write_outputfile_header(outputFile, pruneCut, response): response = response.strip('BPM Networks/') response = response.rstrip('.tx') outputFile.write ('<body><h2 alignment: center> BPMs ') outputFile.write ('</h2><br/><h3>' + 'Prune Cut-Off: ' + pruneCut) outputFile.write ('</h3><br/><br/>\n') def write_current_bpm_header(currentBPM, outputFile): outputFile.write ('<p><FONT face=\"Verdana\" size=\"1\">Current BPM ') outputFile.write(str(currentBPM)) outputFile.write ("<br/>") def initialize_genelist(bpmList, j, side, currentBPM): genelist = [] for i in range (len(bpmList[side][currentBPM])): genelist.append (bpmList[side][currentBPM][i]) genelist.remove (bpmList[side][currentBPM][j]) return genelist def get_log_sorted_values(temp0, gseaType): if gseaType == "1" or gseaType == "2": sorted = getSortedMicroArray (temp0) # create a sorted array of the log10 values from the hughes column else: sorted = getSortedMicroArrayAbs (temp0) # create a sorted array of the log10 values from the hughes column taking the absolute values of the log10s return sorted def get_gene_ranks_KO(sorted, bpmList, currentBPM, gseaType, side): if gseaType == "1" or gseaType == "2": logs1 = generanksAbs (sorted, bpmList[side][currentBPM]) #generate list of all the gene ranking data from the generanksAbs function for the KO pathway else: logs1 = generanksAbs (sorted, bpmList[side][currentBPM]) #generate list of all the gene ranking data from the generanksAbs function for the KO pathway return logs1 def get_gene_ranks_compensatory(sorted, bpmList, currentBPM, gseaType, side): if gseaType == "1" or gseaType == "2": logs2 = generanksAbs (sorted, bpmList[side][currentBPM]) #generate list of all the gene ranking data from the generanksAbs function for the current pathway else: logs2 = generanksAbs (sorted, bpmList[side][currentBPM]) #generate list of all the gene ranking data from the generanksAbs function for the current pathway return logs2 def set_genenames_KO(logs1): names1 = [] for name in logs1: names1.append (name[3]) return names1 def set_genenames_compensatory(logs2): names2 = [] for name2 in logs2: names2.append (name2[3]) return names2 def write_current_knockout_stats(bpmList, currentBPM, j, outputFile, sorted, side, otherside): outputFile.write('-----Knockout Gene:') # print the gene knocked out, both its genename and common name outputFile.write(str(bpmList[side][currentBPM][j])) # the knockout's genename outputFile.write(' (') outputFile.write(str(bpmList[side + 2][currentBPM][j])) # the knockout's common name outputFile.write (')') outputFile.write('-----<br/><br/>The length of the pathway opposite the knockout: ') outputFile.write (str(len(bpmList[otherside][currentBPM]))) # print the length of the pathway opposite the knockout outputFile.write ("<br/><br/>The length of the list of log10 values: ") outputFile.write (str(len(sorted))) # the length of the list of all log10 values, adjusted depending on the number of NaNs outputFile.write ("<br/><br/>") outputFile.write("<pre>Pathway 1\t\t\tlog10\t\t# After\t\t# Before\t\t\t\tPathway 2\t\t\tlog10\t\t# After\t\t# Before<br/><br/>\n") def determine_shorter_pathway(bpmList, currentBPM): if (len (bpmList[0][currentBPM]) < len (bpmList[1][currentBPM])): # determine which list is shorter shorter = 0 else: shorter = 1 return shorter def determine_longer_pathway(currentBPM, bpmList): if (len (bpmList[0][currentBPM]) < len (bpmList[1][currentBPM])): # determine which list is shorter longer = 1 else: longer = 0 return longer def write_item_KO(bpmList, currentBPM, j, item, outputFile, a, side): #CHANGE FROM => if (item == bpmList[side][currentBPM][a] or item == bpmList[side][currentBPM][j]): if (bpmList[side][currentBPM][a] == bpmList[side][currentBPM][j]): # if the current item in the pathway 1 is the knockout, underline it (item is always the genename) outputFile.write ('<u>') outputFile.write (item) outputFile.write ('</u>') else: # otherwise just write the item (item is always the genename) outputFile.write (item) def write_item_compensatory(bpmList, currentBPM, a, item, outputFile, side, good): if (good == "yes"): outputFile.write("<span style='background:yellow;'>") outputFile.write (bpmList[side][currentBPM][a]) # print the opposite pathway's (pathway 2) ath gene's genename outputFile.write ('(') outputFile.write (bpmList[side + 2][currentBPM][a]) # print the opposite pathway's (pathway 2) ath gene's common name outputFile.write (')') def write_items_common_name(bpmList, currentBPM, a, j, outputFile, side): outputFile.write ('(') if (bpmList[side][currentBPM][a] == bpmList[side][currentBPM][j]): # if the ath gene in the shorter pathway is the knockout, print the common name of the # knocked out gene and underline it outputFile.write ('<u>') outputFile.write (bpmList[side][currentBPM][a]) outputFile.write ('</u>') else: # otherwise just output the common name for the ath gene in the shorter pathway outputFile.write (bpmList[side][currentBPM][a]) outputFile.write(')') def write_log_and_rank(temp0, x, currentBPM, bpmList, outputFile, logs1, names1, a, side): if ((bpmList[side][currentBPM][a] == temp0[x][0]) or (bpmList[side+2][currentBPM][a] == temp0[x][0])): # if the genename/common name of the ath gene in the pathway (pathway 1) is equal to the name # stored in the zth list in temp 0[z][0] (temp0[z][0] holds the name of the gene whose information # is stored in temp0[z][1], temp0[z][2], and temp0[z][3] for the current list z) outputFile.write ('\t\t\t') outputFile.write (temp0[x][2] + '\t\t') # write the information stored in the third column (log10 value) for the current gene try: # try to print the rank for the curent gene's log10 value among all the log10 values calculated and the # of log10 values after the gene # for the current knockout (this calculation is done in generanksAbs previously). outputFile.write (str(logs1[names1.index (temp0[x][0])][0]) + '\t\t' + str(logs1[names1.index (temp0[x][0])][1])) return logs1[names1.index (temp0[x][0])][0] except ValueError: outputFile.write ("N/A\t\tN/A") return -0.001 else: return 0 def get_rank(temp0, x, currentBPM, bpmList, outputFile, logs1, names1, a, side): if ((bpmList[side][currentBPM][a] == temp0[x][0]) or (bpmList[side+2][currentBPM][a] == temp0[x][0])): try: return logs1[names1.index (temp0[x][0])][0] except ValueError: return 0 else: return 0 def write_scores(outputFile, pFile, pFileKO, CRS_left, CRS_right, random_genesets_pValLeft, random_genesets_pVal, random_knockouts_pValLeft, random_knockouts_pVal, currentBPM): outputFile.write ("<br/><br/>ES Random Genesets = ") outputFile.write (str(CRS_left)) #output the value calculated by the compute_cluster_rank_score method for the KO pathway outputFile.write ("\t\tES Random KOs = ") outputFile.write (str(CRS_left)) #output the value calculated by the compute_cluster_rank_score method for the KO pathway outputFile.write ("\t\tES Random Genesets = ") outputFile.write (str(CRS_right)) #output the value calculated by the compute_cluster_rank_score method for the opposite pathway outputFile.write ("\t\t\tES Random KOs = ") outputFile.write (str(CRS_right)) #output the value calculated by the compute_cluster_rank_score method for the opposite pathway outputFile.write ("<br/><br/>p Random Genesets = ") outputFile.write (str(random_genesets_pValLeft)) # output the p value calculated by the get_random_geneset_data method for the KO pathway (see above) outputFile.write ("\t\t\t\tp Random KOs = ") outputFile.write (str(random_knockouts_pValLeft)) # output the p value calculated by the get_random_knockout_data method for the current pathway (see above) outputFile.write ("\t\t\t\tp Random Genesets = ") outputFile.write (str(random_genesets_pVal)) # output the p value calculated by the get_random_geneset_data method for the KO pathway (see above) outputFile.write ("\t\t\t\tp Random KOs = ") outputFile.write (str(random_knockouts_pVal)) # output the p value calculated by the get_random_knockout_data method for the current pathway (see above) pFile.write (str(random_genesets_pVal)) # output the p value to a separate file for easy histogram creation later pFile.write ('\t') pFile.write (str (currentBPM)) #output the BPM of the p value to the same file as the get_random_geneset_datas, so finding validated BPMs can be done easily pFile.write ('\t1\n') pFileKO.write(str(random_knockouts_pVal) + '\t' + str(currentBPM) + '\t1\n') def write_pruned(pruned, pruned_common, temp, outputFile, prunedPFile, prunedPFileKO, i, knockout_pathway_common, knockout_pathway_Y, expression_data, knockout_genes_list): if (len(pruned) > 2): CRS_right = compute_rank_cluster_score(temp, pruned) random_genesets_CRS_scores = get_random_geneset_data(temp, pruned) random_knockouts_CRS_scores = get_random_knockout_data(expression_data, knockout_pathway_common, knockout_pathway_Y, pruned_common, pruned, knockout_genes_list) random_genesets_pVal = calculate_p_value(CRS_right, random_genesets_CRS_scores) random_knockouts_pVal = calculate_p_value(CRS_right, random_knockouts_CRS_scores) prunedPFile.write(str(random_genesets_pVal) + '\t' + str(i) + '\t1\n') prunedPFileKO.write(str(random_knockouts_pVal) + '\t' + str(i) + '\t1\n') else: CRS_right = 'N/A' random_genesets_pVal = 'N/A' random_knockouts_pVal = 'N/A' outputFile.write('\n\n\t\t\t\t\t\t\t\t\t\t\t\t\tES Pruned Random Genesets =' + str(CRS_right) + 'p Pruned Random Genesets = ' + str(random_genesets_pVal) + '\n') outputFile.write('\t\t\t\t\t\t\t\t\t\t\t\t\tES Pruned Random KOs =' + str(CRS_right) + '\tp Pruned Random KOs = ' + str(random_knockouts_pVal)) outputFile.write ("<br/><br/>") outputFile.write("<hr/><br/></pre><br/>\n") def KO_output(genelist, bpmList, j, currentBPM, temp, gseaType, outputFile, pFile, pFileKO, prunedPFile, prunedPFileKO, pruneCut, expression_data, knockout_genes_list): CRS_right = compute_rank_cluster_score(temp, bpmList[1][currentBPM]) CRS_left = compute_rank_cluster_score(temp, genelist) random_genesets_CRS_scores = get_random_geneset_data(temp, bpmList[1][currentBPM]) # calculate the get_random_geneset_data from the gsea value of the current pathway random_knockouts_CRS_scores = get_random_knockout_data(expression_data, bpmList[2][currentBPM], bpmList[0][currentBPM], bpmList[3][currentBPM], bpmList[1][currentBPM], knockout_genes_list) random_genesets_pVal = calculate_p_value(CRS_right, random_genesets_CRS_scores) random_knockouts_pVal = calculate_p_value(CRS_right, random_knockouts_CRS_scores) random_genesets_CRS_scoresLeft = get_random_geneset_data(temp, genelist) random_knockouts_CRS_scoresLeft = get_random_knockout_data(expression_data, bpmList[2][currentBPM], bpmList[0][currentBPM], bpmList[2][currentBPM], bpmList[0][currentBPM], knockout_genes_list) random_genesets_pValLeft = calculate_p_value(CRS_left, random_genesets_CRS_scoresLeft) random_knockouts_pValLeft = calculate_p_value(CRS_left, random_knockouts_CRS_scoresLeft) sorted = get_log_sorted_values(temp, gseaType) logs1 = get_gene_ranks_KO(sorted, bpmList, currentBPM, gseaType, 0) logs2 = get_gene_ranks_compensatory(sorted, bpmList, currentBPM, gseaType, 1) names1 = set_genenames_KO(logs1) #names1 and names2 will be used to store the names of the genes in logs1 and logs2, stored with the same indices as their corresponding information in logs1 and logs2, names2 = set_genenames_compensatory(logs2) #so names1 and names2 effectively allow easy access to logs1 and logs2 write_current_knockout_stats(bpmList, currentBPM, j, outputFile, sorted, 0, 1) shorter = determine_shorter_pathway(bpmList, currentBPM) longer = determine_longer_pathway(currentBPM, bpmList) pruned = [] pruned_common = [] a = 0 # a counter that holds the number of iterations through the pathways that have occurred for item in bpmList[shorter][currentBPM]: # for every gene in the shorter pathway of the current BPM found = 0 # a boolean that tells whether or not the gene's log10 data has been found in temp during write_log_and_ranks good = "not yet" if shorter == 0: # always print the side with the knockout on the left, so if the side with the # knockout is shorter, then output current item write_item_KO(bpmList, currentBPM, j, item, outputFile, a, 0) #BIG CHANGE => both a's used to be j's write_items_common_name(bpmList, currentBPM, a, j, outputFile, 2) # CHANGE => outputFile.write ('\t\t\t\t\t') found = 0 for x in range (len (temp)): # for the size of the hughes column if (found != 0): x = len(temp) + 1 else: found = write_log_and_rank(temp, x, currentBPM, bpmList, outputFile, logs1, names1, a, 0) found = 0 outputFile.write ('\t\t\t\t\t') for z in range (len (temp)): # for the size of the hughes column rank = get_rank(temp, z, currentBPM, bpmList, outputFile, logs2, names2, a, 1) if rank != 0: break if (float(float(rank)/float(len(temp))) > pruneCut): good = "yes" pruned.append(bpmList[longer][currentBPM][a]) pruned_common.append(bpmList[longer + 2][currentBPM][a]) else: good = "no" write_item_compensatory(bpmList, currentBPM, a, item, outputFile, 1, good) write_log_and_rank(temp, z, currentBPM, bpmList, outputFile, logs2, names2, a, 1) if good == "yes": outputFile.write("</span>") outputFile.write ('<br/>') found = 0 a+=1 # for each iteration, increment a else: # if the shorter list isn't 0, but is in fact 1 write_item_KO(bpmList, currentBPM, j, bpmList[longer][currentBPM][a], outputFile, a, 0) write_items_common_name(bpmList, currentBPM, a, j, outputFile, 2) found = 0 for x in range (len (temp)): # for the size of the hughes column if (found != 0): x = len(temp) + 1 else: found = write_log_and_rank(temp, x, currentBPM, bpmList, outputFile, logs1, names1, a, 0) outputFile.write ('\t\t\t\t\t') found = 0 for z in range (len (temp)): # for the size of the hughes column rank = get_rank(temp, z, currentBPM, bpmList, outputFile, logs2, names2, a, 1) if rank != 0: break if (float(float(rank)/float(len(temp))) > pruneCut): good = "yes" pruned.append(bpmList[shorter][currentBPM][a]) pruned_common.append(bpmList[shorter + 2][currentBPM][a]) else: good = "no" write_item_compensatory(bpmList, currentBPM, a, item, outputFile, 1, good) found = write_log_and_rank(temp, z, currentBPM, bpmList, outputFile, logs2, names2, a, 1) if good == "yes": outputFile.write("</span>") outputFile.write ('<br/>') found = 0 a+=1 # increment a # this while loop performs the same operations as above for the longer list, because the above # loop finished the smaller list while a < len(bpmList[longer][currentBPM]): found = 0 good = "not yet" if longer == 1: # if longer is 1, then it doesn't contain the knockout, so is on the right, # so it must be moved to align with the pathway 2 information that was written above outputFile.write ('\t\t\t\t\t\t\t\t\t\t\t\t\t') found = 0 for z in range (len (temp)): # for the size of the hughes column rank = get_rank(temp, z, currentBPM, bpmList, outputFile, logs2, names2, a, 1) if rank != 0: break if (float(float(rank)/float(len(temp))) > pruneCut): good = "yes" pruned.append(bpmList[longer][currentBPM][a]) pruned_common.append(bpmList[longer + 2][currentBPM][a]) else: good = "no" write_item_compensatory(bpmList, currentBPM, a, bpmList[longer][currentBPM][a], outputFile, 1, good) #BIG CHANGE => changed item to bpmList[longer][currentBPM][a] for z in range (len (temp)): # for the size of the hughes column if (found != 0): z = len(temp) + 1 else: found = write_log_and_rank(temp, z, currentBPM, bpmList, outputFile, logs2, names2, a, 1) #CHANGE => 1 to 2 if good == "yes": outputFile.write("</span>") found = 0 outputFile.write ('<br/>') a+=1 # increment a else: # if longer is shorter, and therefore contains the knockout write_item_KO(bpmList, currentBPM, j, bpmList[longer][currentBPM][a], outputFile, a, 0) write_items_common_name(bpmList, currentBPM, a, j, outputFile, 2) found = 0 for x in range (len (temp)): # for the size of the hughes column if (found != 0): x = len(temp) + 1 else: found = write_log_and_rank(temp, x, currentBPM, bpmList, outputFile, logs1, names1, a, 0) found = 0 outputFile.write ('<br/>') a+=1 write_scores(outputFile, pFile, pFileKO, CRS_left, CRS_right, random_genesets_pValLeft, random_genesets_pVal, random_knockouts_pValLeft, random_knockouts_pVal, currentBPM) write_pruned(pruned, pruned_common, temp, outputFile, prunedPFile, prunedPFileKO, currentBPM, bpmList[2][currentBPM], bpmList[0][currentBPM], expression_data, knockout_genes_list) def compensatory_output(genelist, bpmList, k, currentBPM, temp, gseaType, outputFile, pFile, pFileKO, prunedPFile, prunedPFileKO, pruneCut, expression_data, knockout_genes_list): CRS_right = compute_rank_cluster_score(temp, bpmList[0][currentBPM]) CRS_left = compute_rank_cluster_score(temp, genelist) random_genesets_CRS_scores = get_random_geneset_data(temp, bpmList[0][currentBPM]) # calculate the get_random_geneset_data from the gsea value of the current pathway random_knockouts_CRS_scores = get_random_knockout_data(expression_data, bpmList[3][currentBPM], bpmList[1][currentBPM], bpmList[2][currentBPM], bpmList[0][currentBPM], knockout_genes_list) random_genesets_pVal = calculate_p_value(CRS_right, random_genesets_CRS_scores) random_knockouts_pVal = calculate_p_value(CRS_right, random_knockouts_CRS_scores) random_genesets_CRS_scoresLeft = get_random_geneset_data(temp, genelist) random_knockouts_CRS_scoresLeft = get_random_knockout_data(expression_data, bpmList[3][currentBPM], bpmList[1][currentBPM], bpmList[3][currentBPM], bpmList[1][currentBPM], knockout_genes_list) random_genesets_pValLeft = calculate_p_value(CRS_left, random_genesets_CRS_scores) random_knockouts_pValLeft = calculate_p_value(CRS_left, random_knockouts_CRS_scores) sorted = get_log_sorted_values(temp, gseaType) logs1 = get_gene_ranks_KO(sorted, bpmList, currentBPM, gseaType, 1) logs2 = get_gene_ranks_compensatory(sorted, bpmList, currentBPM, gseaType, 0) names1 = set_genenames_KO(logs1) #names1 and names2 will be used to store the names of the genes in logs1 and logs2, stored with the same indices as their corresponding information in logs1 and logs2, names2 = set_genenames_compensatory(logs2) #so names1 and names2 effectively allow easy access to logs1 and logs2 write_current_knockout_stats(bpmList, currentBPM, k, outputFile, sorted, 1, 0) shorter = determine_shorter_pathway(bpmList, currentBPM) longer = determine_longer_pathway(currentBPM, bpmList) pruned = [] pruned_common = [] a = 0 for item in bpmList[shorter][currentBPM]: found = 0 good = "not yet" if shorter == 1: write_item_KO(bpmList, currentBPM, k, bpmList[1][currentBPM][a], outputFile, a, 1) #BIG CHANGE => a used to be k, bpmList[1][currentBPM][a] used to be item write_items_common_name(bpmList, currentBPM, a, k, outputFile, 3) found = 0 for x in range (len (temp)): if (found == 1): x = len(temp) + 1 else: found = write_log_and_rank(temp, x, currentBPM, bpmList, outputFile, logs1, names1, a, 1) found = 0 outputFile.write ('\t\t\t\t\t') for z in range (len (temp)): # for the size of the hughes column rank = get_rank(temp, z, currentBPM, bpmList, outputFile, logs2, names2, a, 0) if rank != 0: break if (float(float(rank)/float(len(temp))) > pruneCut): good = "yes" pruned.append(bpmList[longer][currentBPM][a]) pruned_common.append(bpmList[longer + 2][currentBPM][a]) else: good = "no" write_item_compensatory(bpmList, currentBPM, a, item, outputFile, 0, good) write_log_and_rank(temp, z, currentBPM, bpmList, outputFile, logs2, names2, a, 0) if good == "yes": outputFile.write("</span>") found = 0 outputFile.write ('<br/>') a+=1 else: write_item_KO(bpmList, currentBPM, k, bpmList[1][currentBPM][a], outputFile, a, 1)#BIG CHANGE => a used to be k write_items_common_name(bpmList, currentBPM, a, k, outputFile, 3) found = 0 for x in range (len (temp)): if (found == 1): x = len(temp) + 1 else: write_log_and_rank(temp, x, currentBPM, bpmList, outputFile, logs1, names1, a, 1) found = 0 outputFile.write ('\t\t\t\t\t') for z in range (len (temp)): # for the size of the hughes column rank = get_rank(temp, z, currentBPM, bpmList, outputFile, logs2, names2, a, 0) if rank != 0: break if (float(float(rank)/float(len(temp))) > pruneCut): good = "yes" pruned.append(bpmList[shorter][currentBPM][a]) pruned_common.append(bpmList[shorter + 2][currentBPM][a]) else: good = "no" write_item_compensatory(bpmList, currentBPM, a, item, outputFile, 0, good) found = write_log_and_rank(temp, z, currentBPM, bpmList, outputFile, logs2, names2, a, 0) if good == "yes": outputFile.write("</span>") found = 0 outputFile.write ('<br/>') a+=1 for a in range (a, len(bpmList[longer][currentBPM])): #BIG CHANGE => changed from range(len(bpmList[longer][currentBPM])) found = 0 good = "not yet" if longer == 0: outputFile.write ('\t\t\t\t\t\t\t\t\t\t\t\t\t') for z in range (len (temp)): # for the size of the hughes column rank = get_rank(temp, z, currentBPM, bpmList, outputFile, logs2, names2, a, 0) if rank != 0: break if (float(float(rank)/float(len(temp))) > pruneCut): good = "yes" pruned.append(bpmList[longer][currentBPM][a]) pruned_common.append(bpmList[longer + 2][currentBPM][a]) else: good = "no" write_item_compensatory(bpmList, currentBPM, a, bpmList[longer][currentBPM][a], outputFile, 0, good) #BIG CHANGE => bpmList[longer][currentBPM][a] used to be item found = 0 found = write_log_and_rank(temp, z, currentBPM, bpmList, outputFile, logs2, names2, a, 0) if good == "yes": outputFile.write("</span>") found = 0 outputFile.write ('<br/>') a+=1 else: write_item_KO(bpmList, currentBPM, k, bpmList[longer][currentBPM][a], outputFile, a, 1) #BIG CHANGE => a used to be k write_items_common_name(bpmList, currentBPM, a, k, outputFile, 3) found = 0 for x in range (len (temp)): if (found == 1): x = len(temp) + 1 else: found = write_log_and_rank(temp, x, currentBPM, bpmList, outputFile, logs1, names1, a, 1) found = 0 outputFile.write ('<br/>') a+=1 write_scores(outputFile, pFile, pFileKO, CRS_left, CRS_right, random_genesets_pValLeft, random_genesets_pVal, random_knockouts_pValLeft, random_knockouts_pVal, currentBPM) write_pruned(pruned, pruned_common, temp, outputFile, prunedPFile, prunedPFileKO, currentBPM, bpmList[3][currentBPM], bpmList[1][currentBPM], expression_data, knockout_genes_list) def run(): response = determine_file_path() expression_file_string = raw_input('\nPlease enter the file path to the expression data you would like to use.\n') if (expression_file_string == "hughes"): expression_file_string = "./Expression Data/data_expts1-300_meas.txt" expression_data = open(expression_file_string, 'r') gseaType = '3' pruneCut = raw_input("\nWhat would you like to set the pruning cutoff to?\n") file_strings = get_output_file_strings(gseaType) knock = getKnockouts(expression_data) knockout_genes_list = knock[3] bpmList = create_bpm_list(response, knock) # the bpmList contains all the information gathered above in one list, where columns [0] & [1] contain # the genenames of pathways 1 and 2 for all BPMs, columns [2] & [3] contain the common names of pathways # 1 and 2 for all BPMs, columns [4] & [5] contain the test locations of all the genes in pathways 1 and # 2 of each BPM that are denoted by their genename, and columns [6] & [7] contain the test locations of # all the genes in pathways 1 and 2 of each BPM that are denoted by their common name ourList = ourGeneList(bpmList) theirList = theirListLocations(ourList, knock[0]) pFile = open(file_strings[0], 'w') outputFile = open(file_strings[1], 'w') prunedPFile = open (file_strings[2], 'w') pFileKO = open(file_strings[3], 'w') prunedPFileKO = open(file_strings[4], 'w') write_outputfile_header(outputFile, pruneCut, response) print 'Beginning...' for i in range(0, len(bpmList[0])): # for every BPM print print '\tBPM ', str(i), '...' print currentBPM = i write_current_bpm_header(currentBPM, outputFile) for j in range(0, len(bpmList[0][currentBPM])): # for every gene in pathway 1 of the current BPM print '\t\tLooking at gene ', bpmList[0][currentBPM][j], ' for possible KO output' if ((bpmList[4][currentBPM][j] != -1) and (len(bpmList[1][currentBPM]) > 2)): # if there is a test for the current gene, and the length of the pathway on the opposite side is greater than 2 genelist = initialize_genelist(bpmList, j, 0, currentBPM) temp0 = getHughesColumn(expression_data, bpmList[4][currentBPM][j]) # get the corresponding column from the hughes file print '\t\tUsing gene ', bpmList[0][currentBPM][j], ' as the KO' KO_output(genelist, bpmList, j, currentBPM, temp0, gseaType, outputFile, pFile, pFileKO, prunedPFile, prunedPFileKO, float(pruneCut), expression_data, knockout_genes_list) elif ((bpmList[6][currentBPM][j] != -1) and (len(bpmList[1][currentBPM]) > 2)): # if there was no test denoted by a genename, check if there is a test denoted by a common name for the # current gene in pathway 1 of the current BPM, and make sure the opposite pathway has length > 2 genelist = initialize_genelist(bpmList, j, 0, currentBPM) temp1 = getHughesColumn(expression_data, bpmList[6][currentBPM][j]) print '\t\tUsing gene ', bpmList[0][currentBPM][j], ' as the KO' KO_output(genelist, bpmList, j, currentBPM, temp1, gseaType, outputFile, pFile, pFileKO, prunedPFile, prunedPFileKO, float(pruneCut), expression_data, knockout_genes_list) print '\t\tFinished First Pathway...' print for k in range(0, len(bpmList[1][currentBPM])): # for every gene in pathway 2 in the current BPM print '\t\tLooking at gene ', bpmList[1][currentBPM][k], ' for possible compensatory output' if ((bpmList[5][currentBPM][k] != -1) and (len(bpmList[0][currentBPM]) > 2)): # if there is a test denoted by genename for the current gene and the opposite pathway has length > 2 genelist = initialize_genelist(bpmList, k, 1, currentBPM) temp2 = getHughesColumn(expression_data, bpmList[5][currentBPM][k]) print '\t\tUsing gene ', bpmList[1][currentBPM][k], ' as the KO' compensatory_output(genelist, bpmList, k, currentBPM, temp2, gseaType, outputFile, pFile, pFileKO, prunedPFile, prunedPFileKO, float(pruneCut), expression_data, knockout_genes_list) elif ((bpmList[7][currentBPM][k] != -1) and (len(bpmList[0][currentBPM]) > 2)): # if the wasn't a test denoted by genename for the current gene but there is one denoted by common name # and the length of the opposite pathway is greater than 2 genelist = initialize_genelist(bpmList, k, 1, currentBPM) temp3 = getHughesColumn(expression_data, bpmList[7][currentBPM][k]) print '\t\tUsing gene ', bpmList[1][currentBPM][k], ' as the KO' compensatory_output(genelist, bpmList, k, currentBPM, temp3, gseaType, outputFile, pFile, pFileKO, prunedPFile, prunedPFileKO, float(pruneCut), expression_data, knockout_genes_list) print '\t\tFinished Second Pathway...' outputFile.write ('</p>') outputFile.close() print '********************************************' run()